# ------------------------------------------------------------------------------
# Gets representative chief complaints
# Author: Cassidy Shubatt <cshubatt@gmail.com>
# To run: bash 01_get_representative_ccs.sh
# ------------------------------------------------------------------------------

# Libraries --------------------------------------------------------------------
message("Loading libraries...")
library(here)
library(yaml)
library(tidyverse)
library(glue)
library(xtable)

# Directory that receives this step's intermediate outputs.
temp <- here::here("code", "07_other_physician_errors", "temp")
# Project utility module; supplies `safe_left_join()` used in this script.
u <- modules::use(here::here("lib", "util.R"))

# Load Data --------------------------------------------------------------------
message("Loading data...")
# Not referenced by name anywhere else in this script; presumably interpolated
# by glue() into the full-cohort path template below -- TODO confirm against
# lib/filepaths.yml.
overnight_lab <- ""
paths <- read_yaml(here::here("lib", "filepaths.yml"))
# Chief-complaint table with the cardiac-arrest column removed.
cc <- select(readRDS(paths$analysis$cc), -cc_arrest_cardiac)
cc_labels <- read_csv(here::here("lib", "cc_labels.csv"))

# Cohort with chief-complaint columns attached; rows flagged `exclude` are
# dropped.
full_cohort <- paths$analysis$full_cohort %>%
  glue() %>%
  readRDS() %>%
  u$safe_left_join(cc) %>%
  filter(!exclude)

# CC Frequency -----------------------------------------------------------------
message("Getting frequency of CCs in full pop vs. treated...")

# Column-wise totals of the columns `vars` of `df`, as a named numeric vector.
# NOTE(review): totals are only counts if the CC columns are 0/1 or logical
# indicators -- confirm.
sum_cc_cols <- function(df, vars) {
  colSums(select(df, all_of(vars)))
}

# Candidate CC columns: every column of `cc` except the ones listed here.
non_cc_cols <- c(
  "ptid", "ed_enc_id", "cc_arrest_cardiac", "chief_complaint_sup", "total_cc"
)
cc_vars_all <- setdiff(names(cc), non_cc_cols)

# Keep only the CCs with a positive total in the cohort.
cc_totals <- sum_cc_cols(full_cohort, cc_vars_all)
cc_freq_all <- cc_totals[cc_totals > 0]
cc_vars <- names(cc_freq_all)

# Share of cohort rows per CC, ranked from most to least frequent.
freq_rank_df <- tibble(
  chief_complaint = cc_vars,
  frequency = cc_freq_all / nrow(full_cohort)
) %>%
  mutate(freq_rank = rank(-frequency)) %>%
  arrange(freq_rank)
write_rds(freq_rank_df, file.path(temp, "cc_frequency_df.rds"))

# Totals among rows flagged `test_010_day`, and the CCs that occur there.
# Neither object is used again in this script.
cc_freq_tested <- sum_cc_cols(filter(full_cohort, test_010_day), cc_vars)
ccs_in_tested <- cc_freq_tested[cc_freq_tested > 0]

# Totals among rows flagged `stent_or_cabg_010_day` (the "treated" group).
cc_freq_treated <- sum_cc_cols(
  filter(full_cohort, stent_or_cabg_010_day),
  cc_vars
)

# Representativeness -----------------------------------------------------------
message("Getting symptom representativeness...")
# Denominators: all cohort rows, and rows flagged `stent_or_cabg_010_day`.
n_all <- nrow(full_cohort)
n_treated <- sum(full_cohort$stent_or_cabg_010_day)

# One row per chief complaint:
#   freq_all     - % of all cohort rows with the CC (rounded to 3 decimals)
#   freq_treated - % of treated rows with the CC (rounded to 3 decimals)
#   rep_ratio    - freq_treated / freq_all (rounded to 4 decimals)
# The ratio is taken from the UNROUNDED percentages and only then rounded.
# Dividing already-rounded values makes the trailing digits of rep_ratio
# meaningless, can flip borderline CCs across the `rep_ratio > 1` cut below,
# and gives Inf when a rare CC's freq_all rounds to 0.
cc_rank_df <- tibble(
  chief_complaint = cc_vars,
  pct_all = 100 * cc_freq_all / n_all,
  pct_treated = 100 * cc_freq_treated / n_treated
) %>%
  mutate(
    rep_ratio = round(pct_treated / pct_all, digits = 4),
    freq_all = round(pct_all, digits = 3),
    freq_treated = round(pct_treated, digits = 3)
  ) %>%
  select(chief_complaint, freq_all, freq_treated, rep_ratio) %>%
  arrange(desc(rep_ratio)) %>%
  # occurs in more than 0.5% of treated
  filter(freq_treated > 0.5)

# CCs that are over-represented among the treated (ratio above 1); kept as a
# one-column tibble.
representative_ccs <- cc_rank_df %>%
  filter(rep_ratio > 1) %>%
  select(chief_complaint)

print(cc_rank_df, n = 20)

# LaTeX table source: labelled CCs with overall frequency and ratio.
xt <- cc_rank_df %>%
  u$safe_left_join(cc_labels) %>%
  select(cc_label, freq_all, rep_ratio) %>%
  xtable()

# Save -------------------------------------------------------------------------
message("Saving...")

# Full ranking table as CSV in the temp directory.
rank_csv_path <- file.path(temp, "cc_representativeness.csv")
write_csv(cc_rank_df, rank_csv_path)

# Feature names for the representative CCs: each CC name gets the
# "ed_enc_t0d_count_" prefix. glue() reads `rep_ccs` from this environment.
rep_ccs <- representative_ccs[["chief_complaint"]]
rep_ccs_featnames <- glue("ed_enc_t0d_count_{rep_ccs}")
write_rds(rep_ccs_featnames, paths$analysis$representative_vars)

# LaTeX version of the table, without row names.
tex_path <- file.path(temp, "symptom_representativeness.tex")
print(xt, type = "latex", file = tex_path, include.rownames = FALSE)

message("Done.")
